set.seed(1)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "wordcloud2", "stopwords", "magicfor", "gapminder", "arrow")
for(i in required_packages) { 
  if(!require(i, character.only = T)) {
    #  if package is not existing, install then load the package
    install.packages(i, dependencies = T)
  require(i, character.only = T)
  }
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15))
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(,
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))
## theme settings for charts without y axis
theme_blank <- 
  theme(panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.line.y = element_blank(),
        axis.text.y = element_blank())

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

Without 0 volume included

get_count_range <-  function(lower, higher)
{
  sql <- glue(
        "SELECT COUNT(*) AS count, SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS volume  
         FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
         WHERE location = 2840 
         AND spell = ''
         AND keyword_info_search_volume > {lower} 
         AND keyword_info_search_volume <= {higher}")
      tb <- bq_project_query("dataforseo-bigquery", sql)
      list(count = bq_table_download(tb)$count, volume = bq_table_download(tb)$volume)
}
df <- tribble(
  ~cat, ~count,
  "1 - 10", get_count_range(0, 10),
  "10 - 100", get_count_range(10, 100),
  "100 - 1000", get_count_range(100, 1000),
  "1000 - 10K", get_count_range(1000, 10000),
  "10K - 100K", get_count_range(10000, "100000"),
  "100K+", get_count_range("100001", "100000000000")) 

cat6 <- df %>% unnest_wider(count) %>% 
    mutate(volume_prop = volume / sum(volume),
           count_prop = count / sum(count)) %>% 
    mutate(cat = factor(cat, levels = c("1 - 10", "10 - 100", "100 - 1000", "1000 - 10K",
                                        "10K - 100K", "100K+")))
cat6 %>% 
  ggplot(aes(x = cat, y = volume_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(volume_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all search volume", title = "Volume of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.7), expand = c(0.001, 0.001))

cat6 %>% 
  ggplot(aes(x = cat, y = count_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(count_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all searches", title = "Number of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.7), expand = c(0.001, 0.001))

cat4 <-  cat6 %>% mutate(cat = 
    case_when(
        cat == "1 - 10" ~ "1 - 100",
        cat == "10 - 100" ~ "1 - 100",
        cat == "100 - 1000" ~ "100 - 1000",
        cat == "1000 - 10K" ~ "1000 - 10K",
        cat == "10K - 100K" ~ "10K+",
        cat == "100K+" ~ "10K+",
    )) %>% 
    group_by(cat) %>% 
    summarise(across(everything(), sum))
cat4 %>% 
  ggplot(aes(x = cat, y = volume_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(volume_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all search volume", title = "Volume of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0.001, 0.001))

cat4 %>% 
  ggplot(aes(x = cat, y = count_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(count_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all searches", title = "Number of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0.001, 0.001))

With 0 volume included

get_count_range <-  function(lower, higher)
{
  sql <- glue(
        "SELECT COUNT(*) AS count, SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS volume  
         FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
         WHERE location = 2840 
         AND spell = ''
         AND keyword_info_search_volume >= {lower} 
         AND keyword_info_search_volume <= {higher}")
      tb <- bq_project_query("dataforseo-bigquery", sql)
      list(count = bq_table_download(tb)$count, volume = bq_table_download(tb)$volume)
}
df <- tribble(
  ~cat, ~count,
  "0 - 10", get_count_range(0, 10),
  "10 - 100", get_count_range(10, 100),
  "100 - 1000", get_count_range(100, 1000),
  "1000 - 10K", get_count_range(1000, 10000),
  "10K - 100K", get_count_range(10000, "100000"),
  "100K+", get_count_range("100001", "100000000000")) 

cat6 <- df %>% unnest_wider(count) %>% 
    mutate(volume_prop = volume / sum(volume),
           count_prop = count / sum(count)) %>% 
    mutate(cat = factor(cat, levels = c("0 - 10", "10 - 100", "100 - 1000", "1000 - 10K",
                                        "10K - 100K", "100K+")))
cat6 %>% 
  ggplot(aes(x = cat, y = volume_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(volume_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all search volume", title = "Volume of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.7), expand = c(0.001, 0.001))

cat6 %>% 
  ggplot(aes(x = cat, y = count_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(count_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all searches", title = "Number of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.7), expand = c(0.001, 0.001))

cat4 <-  cat6 %>% mutate(cat = 
    case_when(
        cat == "0 - 10" ~ "0 - 100",
        cat == "10 - 100" ~ "0 - 100",
        cat == "100 - 1000" ~ "100 - 1000",
        cat == "1000 - 10K" ~ "1000 - 10K",
        cat == "10K - 100K" ~ "10K+",
        cat == "100K+" ~ "10K+",
    )) %>% 
    group_by(cat) %>% 
    summarise(across(everything(), sum))
cat4 %>% 
  ggplot(aes(x = cat, y = volume_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(volume_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all search volume", title = "Volume of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0.001, 0.001))

cat4 %>% 
  ggplot(aes(x = cat, y = count_prop)) +
  geom_bar(stat = "identity", fill = bl_dark, color = "black", width = .85) +
  geom_text(aes(label = glue::glue("{format(round(count_prop, 3) * 100, scientific = FALSE)}%")),
            nudge_y = .04, family = "Montserrat", fontface = "bold", color = "grey40", size = 3) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "Search volume category", y = "Percentage of all searches", title = "Number of searches") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0.001, 0.001))

con <- dbConnect(
    bigrquery::bigquery(),
    project = "dataforseo-bigquery",
    billing = "dataforseo-bigquery"
)
sql <- glue("SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS volume 
FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          AND keyword_info_search_volume > 0
          AND spell = ''")
tb <- bq_project_query("dataforseo-bigquery", sql)
top <- bq_table_download(tb)
total_volume <- top$volume * 10000
con <- dbConnect(
    bigrquery::bigquery(),
    project = "dataforseo-bigquery",
    billing = "dataforseo-bigquery"
)
sql <- glue("SELECT * FROM `dataforseo-bigquery.dataforseo_data.keyword_data` 
          WHERE location = 2840 
          AND keyword_info_search_volume > 0
          AND spell = ''
          ORDER BY keyword_info_search_volume DESC
          LIMIT 50000")
tb <- bq_project_query("dataforseo-bigquery", sql)
top <- bq_table_download(tb, max_results = 50000)

based on DataForSEO

volume_top <- top %>%  
  add_rownames() %>% 
  mutate(rowname = as.numeric(rowname)) %>% 
  select(rowname, volume = keyword_info_search_volume)
ylab <- c(50, 100, 150, 200)
volume_top %>% 
  filter(rowname < 10000) %>% 
  mutate(cat = case_when(
    rowname < 500 ~ "Top 500",
    rowname < 2000 ~ "Top 2000",
    rowname < 10000 ~ "Top 10000"
  )) %>% 
  mutate(cat = factor(cat, levels = c("Top 500", "Top 2000", "Top 10000"))) %>% 
  head(10500) %>% 
  ggplot(aes(x = rowname, y = volume, fill = cat)) +
  geom_area(alpha = 0.8) +
    scale_y_continuous(labels = scales::percent) +
  scale_y_continuous(
    labels = glue("{ylab} M"),
    breaks = 10^6 * ylab,
    limits = c(0, 200* 10^6), 
    expand = c(.001, .001)
    ) +
  labs(x = NULL, title = "Volume of top searches", fill = "") + 
  annotate("text", x = 9000, y = 65*10^6, label = "Remaining\n99.9967%", family = "Montserrat") + 
  geom_segment(aes(x = 7900, y = 42*10^6, xend = 10000, yend = 42*10^6),
               arrow = arrow(length = unit(0.35, "cm"), type = "closed")) +
  scale_x_continuous(expand = c(.001, .001), limits = c(0, 10000)) +
  scale_fill_manual(values = c(bl_dark, bl_col, "grey75")) +
  theme(axis.text.x = element_blank())# +

  #ggsave(here::here("plots", "reworked", "volume_by_searches.pdf"),
  #       width = 10, height = 6, device = cairo_pdf)

Based on ahref

top100 <- read_csv("../plots/csv/top1k_table.csv") %>% head(100)

top100 %>% add_rownames() %>% 
    mutate(rowname = as.numeric(rowname)) %>% 
    ggplot(aes(x = rowname, y = volume)) +
    geom_area(alpha = 0.8, fill = bl_dark) +
    labs(x = NULL, title = "Volume of top 100 searches", y = "Volume") +
    scale_y_continuous(labels = c("0%", "0.1%", "0.2%", "0.3%", "0.4%", "0.5%", "0.6%"), limits = c(0, 0.006), expand = c(0,0)) +
    scale_x_continuous(limits = c(1, 100), expand = c(0,0)) +
    theme(axis.text.x = element_blank())